This is a Rmd file analyzing our raw count data by the glmmSeq package as described in the vignette and manual.
sessionInfo() #provides list of loaded packages and version of R. I still have version 4.1 for now.
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.31 R6_2.5.1 jsonlite_1.8.4 evaluate_0.20
## [5] cachem_1.0.7 rlang_1.1.0 cli_3.6.1 rstudioapi_0.14
## [9] jquerylib_0.1.4 bslib_0.4.2 rmarkdown_2.21 tools_4.1.2
## [13] xfun_0.38 yaml_2.3.7 fastmap_1.1.1 compiler_4.1.2
## [17] htmltools_0.5.5 knitr_1.42 sass_0.4.5
if ("rtracklayer" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("rtracklayer")
if ("goseq" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('goseq')
if ("rrvgo" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("rrvgo")
if ("GO.db" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("GO.db")
#BiocManager::install("org.Ce.eg.db", force=TRUE) #install if needed
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
library(rtracklayer)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.3
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:geneLenDataBase':
##
## unfactor
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
library(rrvgo)
library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
library(forcats)
sessionInfo() #list of packages after library-ing these packages
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_1.0.0 tidyr_1.3.0 GO.db_3.14.0
## [4] AnnotationDbi_1.56.2 Biobase_2.54.0 rrvgo_1.6.0
## [7] rtracklayer_1.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [10] IRanges_2.28.0 S4Vectors_0.32.4 BiocGenerics_0.40.0
## [13] goseq_1.46.0 geneLenDataBase_1.30.0 BiasedUrn_2.0.9
## [16] ggplot2_3.4.2 dplyr_1.1.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-162 bitops_1.0-7
## [3] matrixStats_0.63.0 bit64_4.0.5
## [5] RColorBrewer_1.1-3 filelock_1.0.2
## [7] progress_1.2.2 httr_1.4.5
## [9] tools_4.1.2 bslib_0.4.2
## [11] utf8_1.2.3 R6_2.5.1
## [13] DBI_1.1.3 mgcv_1.8-42
## [15] colorspace_2.1-0 withr_2.5.0
## [17] tidyselect_1.2.0 prettyunits_1.1.1
## [19] bit_4.0.5 curl_5.0.0
## [21] compiler_4.1.2 cli_3.6.1
## [23] NLP_0.2-1 xml2_1.3.3
## [25] DelayedArray_0.20.0 slam_0.1-50
## [27] sass_0.4.5 scales_1.2.1
## [29] tm_0.7-11 rappdirs_0.3.3
## [31] stringr_1.5.0 digest_0.6.31
## [33] Rsamtools_2.10.0 rmarkdown_2.21
## [35] XVector_0.34.0 pkgconfig_2.0.3
## [37] htmltools_0.5.5 MatrixGenerics_1.6.0
## [39] dbplyr_2.3.2 fastmap_1.1.1
## [41] rlang_1.1.0 rstudioapi_0.14
## [43] RSQLite_2.3.1 treemap_2.4-3
## [45] shiny_1.7.4 jquerylib_0.1.4
## [47] BiocIO_1.4.0 generics_0.1.3
## [49] jsonlite_1.8.4 BiocParallel_1.28.3
## [51] GOSemSim_2.20.0 RCurl_1.98-1.12
## [53] magrittr_2.0.3 GenomeInfoDbData_1.2.7
## [55] wordcloud_2.6 Matrix_1.4-1
## [57] Rcpp_1.0.10 munsell_0.5.0
## [59] fansi_1.0.4 lifecycle_1.0.3
## [61] stringi_1.7.12 yaml_2.3.7
## [63] SummarizedExperiment_1.24.0 zlibbioc_1.40.0
## [65] BiocFileCache_2.2.1 grid_4.1.2
## [67] blob_1.2.4 promises_1.2.0.1
## [69] parallel_4.1.2 ggrepel_0.9.3
## [71] crayon_1.5.2 lattice_0.20-45
## [73] Biostrings_2.62.0 splines_4.1.2
## [75] GenomicFeatures_1.46.5 hms_1.1.3
## [77] KEGGREST_1.34.0 knitr_1.42
## [79] pillar_1.9.0 igraph_1.4.1
## [81] rjson_0.2.21 biomaRt_2.50.3
## [83] XML_3.99-0.14 glue_1.6.2
## [85] evaluate_0.20 data.table_1.14.8
## [87] httpuv_1.6.9 png_0.1-8
## [89] vctrs_0.6.1 purrr_1.0.1
## [91] gtable_0.3.3 cachem_1.0.7
## [93] gridBase_0.4-7 xfun_0.38
## [95] mime_0.12 xtable_1.8-4
## [97] restfulr_0.0.15 later_1.3.0
## [99] pheatmap_1.0.12 tibble_3.2.1
## [101] GenomicAlignments_1.30.0 memoise_2.0.1
## [103] ellipsis_0.3.2
DEGs <- read.csv(file="../../output/signif_genes_normcts.csv", sep=',', header=TRUE) %>% dplyr::select(!c('X'))
#NOTE! This is not a file only with differentially expressed genes, this contains all of the genes in our dataset but also contains p-value information and fold change information to help determine which genes are signficant DEGs based on our model in glmmSeq
rownames(DEGs) <- DEGs$Gene
dim(DEGs)
## [1] 9011 75
Origin_DEGs <- DEGs %>% dplyr::filter(Origin < 0.05)
nrow(Origin_DEGs)
## [1] 847
genes <- rownames(DEGs)
Based off of Ariana’s script
Functional annotation file obtained from: http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz on April 3, 2023.
Pacuta.annot <- read.delim("../../data/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)
dim(Pacuta.annot)
## [1] 16784 21
head(Pacuta.annot)
## query seed_ortholog evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a 45351.EDO48725 2.16e-120
## 2 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 45351.EDO38694 3.18e-123
## 3 Pocillopora_acuta_HIv2___RNAseq.g7985.t1 192875.XP_004345759.1 1.70e-183
## 4 Pocillopora_acuta_HIv2___RNAseq.g13511.t1 45351.EDO28722 2.94e-48
## 5 Pocillopora_acuta_HIv2___TS.g15308.t1 10224.XP_006813307.1 3.19e-20
## 6 Pocillopora_acuta_HIv2___RNAseq.g2057.t1 106582.XP_004573970.1 3.70e-14
## score
## 1 364.0
## 2 355.0
## 3 526.0
## 4 172.0
## 5 92.4
## 6 68.2
## eggNOG_OGs
## 1 COG0620@1|root,KOG2263@2759|Eukaryota,38GZS@33154|Opisthokonta,3BNKS@33208|Metazoa
## 2 COG0450@1|root,KOG0852@2759|Eukaryota,38B9P@33154|Opisthokonta,3BGS4@33208|Metazoa
## 3 COG3239@1|root,KOG4232@2759|Eukaryota,39UMW@33154|Opisthokonta
## 4 2ED36@1|root,2SITK@2759|Eukaryota,39IIK@33154|Opisthokonta,3C3PY@33208|Metazoa
## 5 COG2801@1|root,KOG0017@2759|Eukaryota,38F42@33154|Opisthokonta,3BA5H@33208|Metazoa
## 6 2CSTD@1|root,2S4A7@2759|Eukaryota,3A79X@33154|Opisthokonta,3BT5E@33208|Metazoa,3D9B1@33213|Bilateria,48FVM@7711|Chordata,49CYZ@7742|Vertebrata,4A886@7898|Actinopterygii
## max_annot_lvl COG_category
## 1 33208|Metazoa E
## 2 33208|Metazoa O
## 3 33154|Opisthokonta I
## 4 33208|Metazoa -
## 5 33208|Metazoa OU
## 6 33208|Metazoa S
## Description Preferred_name
## 1 Cobalamin-independent synthase, Catalytic domain -
## 2 negative regulation of male germ cell proliferation PRDX4
## 3 Fatty acid desaturase -
## 4 - -
## 5 K02A2.6-like -
## 6 Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y PIGY
## GOs
## 1 -
## 2 GO:0000003,GO:0001775,GO:0002252,GO:0002263,GO:0002274,GO:0002275,GO:0002283,GO:0002366,GO:0002376,GO:0002443,GO:0002444,GO:0002446,GO:0003006,GO:0003674,GO:0003824,GO:0004601,GO:0005488,GO:0005515,GO:0005575,GO:0005576,GO:0005615,GO:0005622,GO:0005623,GO:0005737,GO:0005783,GO:0005790,GO:0005829,GO:0006082,GO:0006457,GO:0006464,GO:0006468,GO:0006520,GO:0006575,GO:0006793,GO:0006796,GO:0006807,GO:0006810,GO:0006887,GO:0006915,GO:0006950,GO:0006952,GO:0006955,GO:0006979,GO:0007154,GO:0007165,GO:0007249,GO:0007252,GO:0007275,GO:0007276,GO:0007283,GO:0007548,GO:0008150,GO:0008152,GO:0008219,GO:0008285,GO:0008379,GO:0008406,GO:0008584,GO:0009056,GO:0009266,GO:0009409,GO:0009605,GO:0009607,GO:0009617,GO:0009628,GO:0009636,GO:0009893,GO:0009966,GO:0009967,GO:0009987,GO:0010467,GO:0010604,GO:0010646,GO:0010647,GO:0010941,GO:0010942,GO:0010950,GO:0010952,GO:0012501,GO:0012505,GO:0016043,GO:0016192,GO:0016209,GO:0016310,GO:0016491,GO:0016684,GO:0016999,GO:0017001,GO:0017144,GO:0019222,GO:0019471,GO:0019538,GO:0019725,GO:0019752,GO:0019953,GO:0022414,GO:0022417,GO:0023051,GO:0023052,GO:0023056,GO:0030141,GO:0030162,GO:0030198,GO:0031323,GO:0031325,GO:0031410,GO:0031974,GO:0031982,GO:0031983,GO:0032268,GO:0032270,GO:0032501,GO:0032502,GO:0032504,GO:0032940,GO:0033554,GO:0034774,GO:0035556,GO:0036211,GO:0036230,GO:0042119,GO:0042127,GO:0042221,GO:0042592,GO:0042737,GO:0042742,GO:0042743,GO:0042744,GO:0042802,GO:0042803,GO:0042981,GO:0043062,GO:0043065,GO:0043067,GO:0043068,GO:0043085,GO:0043170,GO:0043207,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043233,GO:0043280,GO:0043281,GO:0043299,GO:0043312,GO:0043412,GO:0043436,GO:0043900,GO:0043901,GO:0044093,GO:0044237,GO:0044238,GO:0044248,GO:0044260,GO:0044267,GO:0044281,GO:0044421,GO:0044422,GO:0044424,GO:0044433,GO:0044444,GO:0044446,GO:0044464,GO:0044703,GO:0045055,GO:0045137,GO:0045321,GO:0045454,GO:0045862,GO:0046425,GO:0046427,GO:0046546,GO:0046661,GO:0046903,GO:0046983,GO:0048232,GO:0048513,GO:0048518,GO:0048519,GO:0048522,GO:0048523,GO:0048583,GO:0048584,GO:0048608,GO:0048609,GO:0048731,GO:0048856,GO:0050789,GO:0050790,GO:0050794,GO:0050896,GO:0051171,GO:0051173,GO:0051179,GO:0051186,GO:0051187,GO:0051234,GO:0051239,GO:0051241,GO:0051246,GO:0051247,GO:0051336,GO:0051345,GO:0051604,GO:0051704,GO:0051707,GO:0051716,GO:0051920,GO:0052547,GO:0052548,GO:0055114,GO:0060205,GO:0060255,GO:0061458,GO:0065007,GO:0065008,GO:0065009,GO:0070013,GO:0070417,GO:0070887,GO:0071704,GO:0071840,GO:0072593,GO:0080090,GO:0097190,GO:0097237,GO:0097708,GO:0098542,GO:0098754,GO:0098869,GO:0099503,GO:0101002,GO:1901564,GO:1901605,GO:1902531,GO:1902533,GO:1904813,GO:1904892,GO:1904894,GO:1905936,GO:1905937,GO:1990748,GO:2000116,GO:2000241,GO:2000242,GO:2000254,GO:2000255,GO:2001056,GO:2001233,GO:2001235,GO:2001267,GO:2001269
## 3 -
## 4 -
## 5 -
## 6 GO:0000506,GO:0005575,GO:0005622,GO:0005623,GO:0005737,GO:0005783,GO:0005789,GO:0005886,GO:0006464,GO:0006497,GO:0006505,GO:0006506,GO:0006629,GO:0006643,GO:0006644,GO:0006650,GO:0006661,GO:0006664,GO:0006793,GO:0006796,GO:0006807,GO:0008150,GO:0008152,GO:0008610,GO:0008654,GO:0009058,GO:0009059,GO:0009247,GO:0009987,GO:0012505,GO:0016020,GO:0016254,GO:0019538,GO:0019637,GO:0031984,GO:0032991,GO:0034645,GO:0036211,GO:0042157,GO:0042158,GO:0042175,GO:0043170,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043412,GO:0044237,GO:0044238,GO:0044249,GO:0044255,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044425,GO:0044432,GO:0044444,GO:0044446,GO:0044464,GO:0045017,GO:0046467,GO:0046474,GO:0046486,GO:0046488,GO:0071704,GO:0071944,GO:0090407,GO:0098796,GO:0098827,GO:1901135,GO:1901137,GO:1901564,GO:1901566,GO:1901576,GO:1902494,GO:1903509,GO:1990234
## EC KEGG_ko
## 1 2.1.1.14 ko:K00549
## 2 1.11.1.15 ko:K03386
## 3 1.14.19.31 ko:K12418
## 4 - -
## 5 - -
## 6 - ko:K11001
## KEGG_Pathway
## 1 ko00270,ko00450,ko01100,ko01110,ko01230,map00270,map00450,map01100,map01110,map01230
## 2 ko04214,map04214
## 3 -
## 4 -
## 5 -
## 6 ko00563,ko01100,map00563,map01100
## KEGG_Module KEGG_Reaction KEGG_rclass
## 1 M00017 R04405,R09365 RC00035,RC00113,RC01241
## 2 - - -
## 3 - - -
## 4 - - -
## 5 - - -
## 6 - R05916 -
## BRITE KEGG_TC CAZy BiGG_Reaction
## 1 ko00000,ko00001,ko00002,ko01000 - - -
## 2 ko00000,ko00001,ko01000,ko04147 - - -
## 3 ko00000,ko01000,ko01004 - - -
## 4 - - - -
## 5 - - - -
## 6 ko00000,ko00001 - - -
## PFAMs
## 1 Meth_synt_2
## 2 1-cysPrx_C,AhpC-TSA
## 3 Cyt-b5,FA_desaturase
## 4 -
## 5 RVT_1,rve
## 6 PIG-Y
genes2annot = match(genes, Pacuta.annot$query) #match genes in DEGs (all genes after filtering) to genes in annotation file
sum(is.na(genes2annot)) #number of genes without EggNog annotation
## [1] 2812
missing<-as.data.frame(genes[which(is.na(genes2annot))]) #dataframe of genes without EggNog annotation
head(missing)
## genes[which(is.na(genes2annot))]
## 1 Pocillopora_acuta_HIv2___RNAseq.g7479.t3a
## 2 Pocillopora_acuta_HIv2___RNAseq.g7479.t3b
## 3 Pocillopora_acuta_HIv2___RNAseq.g682.t1
## 4 Pocillopora_acuta_HIv2___RNAseq.g4805.t1
## 5 Pocillopora_acuta_HIv2___RNAseq.g11061.t1
## 6 Pocillopora_acuta_HIv2___RNAseq.g16217.t1
2812/9011 genes without eggnog annotation
names(Pacuta.annot)
## [1] "query" "seed_ortholog" "evalue" "score"
## [5] "eggNOG_OGs" "max_annot_lvl" "COG_category" "Description"
## [9] "Preferred_name" "GOs" "EC" "KEGG_ko"
## [13] "KEGG_Pathway" "KEGG_Module" "KEGG_Reaction" "KEGG_rclass"
## [17] "BRITE" "KEGG_TC" "CAZy" "BiGG_Reaction"
## [21] "PFAMs"
geneInfo0 = data.frame(gene_id = genes, #add gene id
Accession = Pacuta.annot$seed_ortholog[genes2annot], #add accession number
Bitscore = Pacuta.annot$score[genes2annot], #add bitscore
eValue = Pacuta.annot$evalue[genes2annot], #add e value
Description = Pacuta.annot$Description[genes2annot], #add description of gene
Annotation.GO.ID = Pacuta.annot$GOs[genes2annot], #add GO ID's
q_Origin = DEGs$Origin, #add Origin adjusted p-value
q_Treatment = DEGs$Treatment, #add Treatment adjusted p-value
q_Interaction = DEGs$Treatment.Origin, #add Treatment:Origin adjusted p-value
Stable_OriginFC = DEGs$Stable_OriginFC, #add fold change for Slope vs Flat in the stable treatment
Variable_OriginFC = DEGs$Variable_OriginFC, #add fold change for Slope vs Flat in the variable treatment
maxGroupFC = DEGs$maxGroupFC, #add max group fold change (was FC bigger in stable of variable treatment)
col = DEGs$col) #add qualitative significance info
dim(geneInfo0)
## [1] 9011 13
head(geneInfo0,2)
## gene_id Accession Bitscore eValue
## 1 Pocillopora_acuta_HIv2___RNAseq.g27841.t1 45351.EDO35402 566 4.00e-197
## 2 Pocillopora_acuta_HIv2___RNAseq.g14011.t1 45351.EDO48592 449 1.17e-151
## Description
## 1 cyclin-dependent protein serine/threonine kinase activity
## 2 TAF6-like RNA polymerase II, p300 CBP-associated
## Annotation.GO.ID
## 1 GO:0000003,GO:0003006,GO:0003674,GO:0003824,GO:0004672,GO:0004674,GO:0004693,GO:0005575,GO:0005622,GO:0005623,GO:0005634,GO:0005737,GO:0005813,GO:0005815,GO:0005856,GO:0006464,GO:0006468,GO:0006793,GO:0006796,GO:0006807,GO:0007154,GO:0007165,GO:0007548,GO:0008150,GO:0008152,GO:0009987,GO:0015630,GO:0016301,GO:0016310,GO:0016740,GO:0016772,GO:0016773,GO:0019538,GO:0022414,GO:0023052,GO:0032502,GO:0036211,GO:0043170,GO:0043226,GO:0043227,GO:0043228,GO:0043229,GO:0043231,GO:0043232,GO:0043412,GO:0044237,GO:0044238,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044430,GO:0044446,GO:0044464,GO:0050789,GO:0050794,GO:0050896,GO:0051716,GO:0051726,GO:0065007,GO:0071704,GO:0097472,GO:0140096,GO:1901564
## 2 GO:0000118,GO:0000123,GO:0003674,GO:0003676,GO:0003677,GO:0003712,GO:0003713,GO:0005488,GO:0005575,GO:0005622,GO:0005623,GO:0005634,GO:0005654,GO:0005730,GO:0006325,GO:0006338,GO:0006355,GO:0006357,GO:0006464,GO:0006473,GO:0006475,GO:0006807,GO:0006996,GO:0008150,GO:0008152,GO:0009889,GO:0009891,GO:0009893,GO:0009987,GO:0010468,GO:0010556,GO:0010557,GO:0010604,GO:0016043,GO:0016569,GO:0016570,GO:0016573,GO:0016604,GO:0016607,GO:0018193,GO:0018205,GO:0018393,GO:0018394,GO:0019219,GO:0019222,GO:0019538,GO:0030914,GO:0031248,GO:0031323,GO:0031325,GO:0031326,GO:0031328,GO:0031974,GO:0031981,GO:0032991,GO:0036211,GO:0043170,GO:0043226,GO:0043227,GO:0043228,GO:0043229,GO:0043231,GO:0043232,GO:0043233,GO:0043412,GO:0043543,GO:0043966,GO:0044237,GO:0044238,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044428,GO:0044446,GO:0044451,GO:0044464,GO:0045935,GO:0048518,GO:0048522,GO:0050789,GO:0050794,GO:0051171,GO:0051173,GO:0051252,GO:0051254,GO:0051276,GO:0060255,GO:0065007,GO:0070013,GO:0070461,GO:0071704,GO:0071840,GO:0080090,GO:0097159,GO:0140110,GO:1901363,GO:1901564,GO:1902493,GO:1902494,GO:1902680,GO:1903506,GO:1903508,GO:1990234,GO:2000112,GO:2001141
## q_Origin q_Treatment q_Interaction Stable_OriginFC Variable_OriginFC
## 1 0.3312892 0.9999997 0.9995411 -0.2177825 -0.1121965
## 2 0.1507550 0.9999997 0.9995411 0.6976765 0.2428565
## maxGroupFC col
## 1 Stable Not Significant
## 2 Stable Not Significant
Add KEGG annotation information. Downloaded from: http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz on April 3, 2023.
kegg<-read.table("../../data/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", sep="", quote="", na.strings=c("","NA"), blank.lines.skip = FALSE, header=FALSE)
dim(kegg)
## [1] 33730 2
head(kegg)
## V1 V2
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a <NA>
## 2 Pocillopora_acuta_HIv2___RNAseq.g24143.t1b K22584
## 3 Pocillopora_acuta_HIv2___RNAseq.g22918.t1 <NA>
## 4 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 K03386
## 5 Pocillopora_acuta_HIv2___RNAseq.g7985.t1 <NA>
## 6 Pocillopora_acuta_HIv2___RNAseq.g13511.t1 <NA>
Add KEGG annotations to each gene.
geneInfo0$KEGG <- kegg$V2[match(geneInfo0$gene_id, kegg$V1)]
Order geneInfo0 by significance of Origin, q_Origin (adjusted p value)
geneInfo <- geneInfo0[order(geneInfo0[, 'q_Origin']), ]
write.csv(geneInfo, file = "../../output/geneInfo.csv") #gene info for reference/supplement
#geneInfo<-read.csv("../../output/geneInfo.csv")
dim(geneInfo)
## [1] 9011 14
Get gene length information.
#import file
gff <- rtracklayer::import("../../data/Pocillopora_acuta_HIv2.genes_fixed.gff3") #if this doesn't work, restart R and try again
transcripts <- subset(gff, type == "transcript") #keep only transcripts , not CDS or exons
transcript_lengths <- width(transcripts) #isolate length of each gene
seqnames<-transcripts$ID #extract list of gene id
lengths<-cbind(seqnames, transcript_lengths)
lengths<-as.data.frame(lengths) #convert to data frame
geneInfo$Length<-lengths$transcript_lengths[match(geneInfo$gene_id, lengths$seqnames)] #Add in length to geneInfo
Format GO terms to remove dashes and quotes and separate by semicolons (replace , with ;) in Annotation.GO.ID column
geneInfo$Annotation.GO.ID <- gsub(",", ";", geneInfo$Annotation.GO.ID)
geneInfo$Annotation.GO.ID <- gsub('"', "", geneInfo$Annotation.GO.ID)
geneInfo$Annotation.GO.ID <- gsub("-", NA, geneInfo$Annotation.GO.ID)
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
### Generate vector with names of the 847 significant DEGs by Origin
ID.vector <- geneInfo %>%
filter(q_Origin < 0.05) %>%
pull(gene_id)
##Get a list of GO Terms for the 847 significant DEGs by Origin
GO.terms <- geneInfo %>%
filter(q_Origin < 0.05) %>%
dplyr::select(gene_id, Annotation.GO.ID)
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$Annotation.GO.ID), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "Annotation.GO.ID")
GO.terms <- split2
##Construct list of genes with 1 for genes in that are significant for Origin and 0 for those that are not
gene.vector = as.integer(ALL.vector %in% ID.vector) #since we ordered geneInfo by q_Origin, this puts a "1" for the first 847 genes, which are the same genes in ID.vector
names(gene.vector) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf <- nullp(gene.vector, ID.vector, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall <- goseq(pwf, ID.vector, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#adjust p-values
GO$bh_adjust <- p.adjust(GO$over_represented_pvalue, method="BH") #add adjusted p-values
#Write file of results
write.csv(GO, "../../output/GOSeq/GOseq_DEG_Origin.csv")
#Filtering for p < 0.05
GO_05 <- GO %>%
dplyr::filter(bh_adjust<0.05) %>%
dplyr::arrange(., ontology, bh_adjust)
#Filtering for p < 0.05
GO_001 <- GO %>%
dplyr::filter(bh_adjust<0.001) %>%
dplyr::arrange(., ontology, bh_adjust)
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(bh_adjust != "NA") %>%
filter(numInCat>10)%>%
arrange(., bh_adjust)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$bh_adjust), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=bh_adjust)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot)
ggsave(filename=paste0("../../output/GOSeq/OriginDEGs_P05", "_", category, ".png"), plot=GO.plot, dpi=100, width=12, height=100, units="in", limitsize=FALSE)
}
GO_plot_all <- GO_001 %>% drop_na(ontology) %>% mutate(term = fct_reorder(term, numDEInCat)) %>%
mutate(term = fct_reorder(term, ontology)) %>%
ggplot( aes(x=term, y=numDEInCat) ) +
geom_segment( aes(x=term ,xend=term, y=0, yend=numDEInCat), color="grey") +
geom_text(aes(label = over_represented_pvalue), hjust = -1, vjust = 0, size = 2) +
geom_point(size=1, aes(colour = ontology)) +
coord_flip() +
ylim(0,305) +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="bottom"
) +
xlab("") +
ylab("") +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
plot.background=element_blank()) #Set the plot background #set title attributes
GO_plot_all
ggsave("../../output/GOSeq/OriginDEGs_P001.pdf", GO_plot_all, width = 28, height = 175, units = c("in"), dpi=100, limitsize=FALSE)
#ordered by p-value
GO_plot_all_pval <- GO_001 %>% drop_na(ontology) %>% mutate(term = fct_reorder(term, over_represented_pvalue)) %>%
mutate(term = fct_reorder(term, ontology)) %>%
ggplot( aes(x=term, y=numDEInCat) ) +
geom_segment( aes(x=term ,xend=term, y=0, yend=numDEInCat), color="grey") +
geom_text(aes(label = over_represented_pvalue), hjust = -1, vjust = 0, size = 2) +
geom_point(size=1, aes(colour = ontology)) +
coord_flip() +
ylim(0,305) +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="bottom"
) +
xlab("") +
ylab("") +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
plot.background=element_blank()) #Set the plot background #set title attributes
GO_plot_all_pval
ggsave("../../output/GOSeq/OriginDEGs_P001_orderedpval.pdf", GO_plot_all_pval, width = 28, height = 175, units = c("in"), dpi=100, limitsize=FALSE)
##Get a list of KEGG Terms for the 847 significant DEGs by Origin
KO.terms <- geneInfo %>%
filter(q_Origin < 0.05) %>%
dplyr::select(gene_id, KEGG)
#run goseq using Wallenius method for all categories of KEGG terms
KO.wall <-
goseq(
pwf,
ID.vector,
gene2cat = KO.terms,
test.cats = c("KEGG"),
method = "Wallenius",
use_genes_without_cat = TRUE
)
## Using manually entered categories.
## Calculating the p-values...
KO <- KO.wall[order(KO.wall$over_represented_pvalue), ]
colnames(KO)[1] <- "KEGGterm"
#adjust p-values
KO$bh_adjust <- p.adjust(KO$over_represented_pvalue, method = "BH") #add adjusted p-values
#Filtering for p < 0.05
KO_05 <- KO %>%
dplyr::filter(bh_adjust < 0.05) %>%
dplyr::arrange(., bh_adjust)
head(KO_05)
## KEGGterm over_represented_pvalue under_represented_pvalue numDEInCat numInCat
## 1 K17592 2.325271e-06 1 5 5
## bh_adjust
## 1 0.0007882667
#Write file of results
write.csv(KO, "../../output/GOSeq/GOseq_KEGG_Origin.csv")
Info about this KEGG term: K17592: SACS; sacsin
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
### Generate vector with names of the 847 significant DEGs by Origin
ID.vector_upSlope <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC > 0 & Variable_OriginFC > 0) %>%
pull(gene_id)
##Get a list of GO Terms for the 847 significant DEGs by Origin
GO.terms_upSlope <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC > 0 & Variable_OriginFC > 0) %>%
dplyr::select(gene_id, Annotation.GO.ID)
dim(GO.terms_upSlope) #358 genes are upregulated in the Slope Origin in both treatments and significant by Origin
## [1] 358 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms_upSlope$Annotation.GO.ID), ";")
split2 <- data.frame(v1 = rep.int(GO.terms_upSlope$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "Annotation.GO.ID")
GO.terms_upSlope <- split2
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Slope and 0 for those that are not
gene.vector_upSlope = as.integer(ALL.vector %in% ID.vector_upSlope)
names(gene.vector_upSlope) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_upSlope <- nullp(gene.vector_upSlope, ID.vector_upSlope, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_upSlope <- goseq(pwf_upSlope, ID.vector_upSlope, gene2cat=GO.terms_upSlope, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_upSlope <- GO.wall_upSlope[order(GO.wall_upSlope$over_represented_pvalue),]
colnames(GO_upSlope)[1] <- "GOterm"
#adjust p-values
GO_upSlope$bh_adjust <- p.adjust(GO_upSlope$over_represented_pvalue, method="BH") #add adjusted p-values
#Write file of results
write.csv(GO_upSlope, "../../output/GOSeq/GOseq_DEG_Origin_upSlope.csv")
#Filtering for p < 0.05
GO_upSlope_05 <- GO_upSlope %>%
dplyr::filter(bh_adjust<0.05) %>%
dplyr::arrange(., ontology, bh_adjust)
#Filtering for p < 0.05
GO_upSlope_001 <- GO_upSlope %>%
dplyr::filter(bh_adjust<0.001) %>%
dplyr::arrange(., ontology, bh_adjust)
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_upSlope_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(bh_adjust != "NA") %>%
filter(numInCat>10)%>%
arrange(., bh_adjust)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$bh_adjust), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_upslope <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=bh_adjust)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_upslope)
ggsave(filename=paste0("../../output/GOSeq/OriginDEGs_upslope_P05", "_", category, ".png"), plot=GO.plot_upslope, dpi=100, width=12, height=100, units="in", limitsize=FALSE)
}
##
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
### Generate vector with names of the 847 significant DEGs by Origin
ID.vector_upFlat <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC < 0 & Variable_OriginFC < 0) %>%
pull(gene_id)
##Get a list of GO Terms for the 847 significant DEGs by Origin
GO.terms_upFlat <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC < 0 & Variable_OriginFC < 0) %>%
dplyr::select(gene_id, Annotation.GO.ID)
dim(GO.terms_upFlat) #___ genes are upregulated in the Flat Origin in both treatments and significant by Origin
## [1] 480 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms_upFlat$Annotation.GO.ID), ";")
split2 <- data.frame(v1 = rep.int(GO.terms_upFlat$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "Annotation.GO.ID")
GO.terms_upFlat <- split2
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Flat and 0 for those that are not
gene.vector_upFlat = as.integer(ALL.vector %in% ID.vector_upFlat)
names(gene.vector_upFlat) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_upFlat <- nullp(gene.vector_upFlat, ID.vector_upFlat, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_upFlat <- goseq(pwf_upFlat, ID.vector_upFlat, gene2cat=GO.terms_upFlat, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_upFlat <- GO.wall_upFlat[order(GO.wall_upFlat$over_represented_pvalue),]
colnames(GO_upFlat)[1] <- "GOterm"
#adjust p-values
GO_upFlat$bh_adjust <- p.adjust(GO_upFlat$over_represented_pvalue, method="BH") #add adjusted p-values
#Write file of results
write.csv(GO_upFlat, "../../output/GOSeq/GOseq_DEG_Origin_upFlat.csv")
#Filtering for p < 0.05
GO_upFlat_05 <- GO_upFlat %>%
dplyr::filter(bh_adjust<0.05) %>%
dplyr::arrange(., ontology, bh_adjust)
#Filtering for p < 0.05
GO_upFlat_001 <- GO_upFlat %>%
dplyr::filter(bh_adjust<0.001) %>%
dplyr::arrange(., ontology, bh_adjust)
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_upFlat_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(bh_adjust != "NA") %>%
filter(numInCat>10)%>%
arrange(., bh_adjust)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$bh_adjust), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_upFlat <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=bh_adjust)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_upFlat)
ggsave(filename=paste0("../../output/GOSeq/OriginDEGs_upFlat_P05", "_", category, ".png"), plot=GO.plot_upFlat, dpi=100, width=12, height=100, units="in", limitsize=FALSE)
}
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...